Source code for dnadna.params

"""Management of parameters to be learned in a training run."""


import warnings

import torch
import torch.nn as nn

from . import DNADNAWarning
from .utils.config import ConfigMixIn
from .utils.decorators import cached_property
from .utils.misc import unique


# NOTE: All loss functions built into PyTorch are named like SomethingLoss;
# this just builds a mapping from the lower-cased function names (for
# case-insensitive comparison), minus the redundant "Loss" suffix, to the
# functions themselves (all classes, actually)
# TODO: Many of the loss functions in PyTorch are parameterized themselves,
# and we need an interface for specifying loss function parameters.
# TODO: The current code just allows any loss function defined in PyTorch;
# should we restrict this to some specific ones?
LOSS_FUNCS = None


class MCELoss(nn.Module):
    """Mean Circular Error"""

    def forward(self, x, y):
        return torch.min((x - y) % 1, (y - x) % 1).mean().unsqueeze(0)
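

# Illustrative sketch (not part of the original module): MCELoss measures the
# shortest distance between predictions and targets on a circle of
# circumference 1, so for example 0.9 and 0.1 are only 0.2 apart:
#
#     >>> loss = MCELoss()
#     >>> loss(torch.tensor([0.9]), torch.tensor([0.1]))  # roughly
#     tensor([0.2000])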


def _get_loss_funcs():
    global LOSS_FUNCS
    LOSS_FUNCS = {'mce': MCELoss}
    for name, cls in vars(nn).items():
        if not name[0] == '_' and name.endswith('Loss'):
            LOSS_FUNCS[name.lower()[:-4]] = cls


_get_loss_funcs()
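

# Illustrative sketch (not in the original source): after _get_loss_funcs()
# runs, LOSS_FUNCS maps lower-cased names without the "Loss" suffix to the
# loss classes, e.g.:
#
#     >>> LOSS_FUNCS['mse']
#     <class 'torch.nn.modules.loss.MSELoss'>
#     >>> LOSS_FUNCS['crossentropy']
#     <class 'torch.nn.modules.loss.CrossEntropyLoss'>
#     >>> LOSS_FUNCS['mce']
#     <class 'dnadna.params.MCELoss'>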


class ParamSet(ConfigMixIn):
    """
    Class for managing a set of model parameters as defined by the
    :ref:`param set schema <schema-param-set>`.

    In most cases this is not used directly; instead the `LearnedParams`
    subclass of this is used to manage a parameter set used for model
    training.

    This base class implements more basic functionality that does not depend
    on a ``scenario_params`` table and, unlike `LearnedParams`, does not
    support training-specific functionality.
    """

    config_schema = 'param-set'

    # TODO: Default loss funcs should come directly out of the schema but
    # that doesn't likely work yet.
    default_loss_funcs = {
        'regression': 'MSE',
        'classification': 'Cross Entropy'
    }

    def __init__(self, config={}, validate=True):
        # Special case: the training.yml schema allows learned_params to be
        # a list of single-element mappings (how PyYAML loads ordered
        # mappings).  In this case we build a dict from the list, taking
        # advantage of insertion order preservation of plain dicts in
        # Python 3.6+.
        # See https://gitlab.inria.fr/ml_genetics/private/dnadna/-/issues/46
        if isinstance(config, list):
            new_config = {}
            for param in config:
                if len(param) != 1:
                    raise ValueError(
                        f"when {self.__class__.__name__} is given a list of "
                        f"parameters, each item must be a single-element "
                        f"dict keyed by the parameter name")

                param_name, param = list(param.items())[0]
                new_config[param_name] = param

            config = new_config

        super().__init__(config, validate=validate)
        self._normalize_params()

    @cached_property
    def params(self):
        """
        Returns a dict mapping parameter names to their full (normalized)
        configurations.

        Examples
        --------

        This example also demonstrates that the param details are filled out
        with defaults not included in the original config (``loss_weight``,
        ``n_classes``, etc.):

        >>> from dnadna.params import ParamSet
        >>> config = {
        ...     'b': {'type': 'regression', 'loss_func': 'MSE'},
        ...     # 'x' gets the default loss func, cross entropy
        ...     'x': {'type': 'classification', 'classes': 3},
        ...     # 'a' gets the default loss func, MSE
        ...     'a': {'type': 'regression'},
        ...     'z': {'type': 'classification', 'classes': ['a', 'b'],
        ...           'loss_func': 'Cross Entropy'},
        ...     'y': {'type': 'regression', 'loss_func': 'MCE'}
        ... }
        >>> learned_params = ParamSet(config)
        >>> learned_params.params
        {'b': {'type': 'regression', 'loss_func': 'MSE', 'loss_weight': 1,
               'log_transform': False, 'tied_to_position': False},
         'x': {'type': 'classification', 'classes': ['0', '1', '2'],
               'loss_func': 'Cross Entropy', 'loss_weight': 1, 'n_classes': 3},
         'a': {'type': 'regression', 'loss_func': 'MSE', 'loss_weight': 1,
               'log_transform': False, 'tied_to_position': False},
         'z': {'type': 'classification', 'classes': ['a', 'b'],
               'loss_func': 'Cross Entropy', 'loss_weight': 1, 'n_classes': 2},
         'y': {'type': 'regression', 'loss_func': 'MCE', 'loss_weight': 1,
               'log_transform': False, 'tied_to_position': False}}
        """

        return self.config.dict()

    @cached_property
    def param_names(self):
        """
        Return list of names of all learned parameters.

        Parameters are given in the order they were specified in the config.
        If an unordered mapping was used, the order will still be preserved,
        since Python 3.6+ preserves dictionary insertion order and this holds
        when loading mappings from YAML and JSON.

        It is also possible to instantiate the `ParamSet` with a list of
        mappings, each containing one element.  This is how `omaps
        <https://yaml.org/type/omap.html>`_ (ordered mappings) are encoded in
        YAML, and the same structure can be used in JSON.  In this case the
        order of the elements in the list is preserved.

        Examples
        --------

        >>> from dnadna.params import ParamSet
        >>> config = {
        ...     'b': {'type': 'regression', 'loss_func': 'MSE'},
        ...     # 'x' gets the default loss func, cross entropy
        ...     'x': {'type': 'classification', 'classes': 3},
        ...     # 'a' gets the default loss func, MSE
        ...     'a': {'type': 'regression'},
        ...     'z': {'type': 'classification', 'classes': 3,
        ...           'loss_func': 'Cross Entropy'},
        ...     'y': {'type': 'regression', 'loss_func': 'MCE'}
        ... }
        >>> learned_params = ParamSet(config)
        >>> learned_params.param_names
        ['b', 'x', 'a', 'z', 'y']

        Similar (shorter) example, but from a list:

        >>> config = [
        ...     {'b': {'type': 'regression', 'loss_func': 'MSE'}},
        ...     {'x': {'type': 'classification', 'classes': 3}}
        ... ]
        ...
        >>> learned_params = ParamSet(config)
        >>> learned_params.param_names
        ['b', 'x']
        """

        return list(self.params)

    @property
    def regression_params(self):
        """The subset of `params` whose type is ``'regression'``."""
        return {k: v for k, v in self.params.items()
                if v['type'] == 'regression'}

    @property
    def classification_params(self):
        """The subset of `params` whose type is ``'classification'``."""
        return {k: v for k, v in self.params.items()
                if v['type'] == 'classification'}

    @cached_property
    def n_outputs(self):
        """
        Expected number of outputs for a network that is trained for this set
        of parameters.

        This should be one output per regression parameter, and one output
        *per class* of each classification parameter, giving the likelihood
        of an input falling into that class.
        """

        tot_classes = sum(p['n_classes']
                          for p in self.classification_params.values())
        return len(self.regression_params) + tot_classes
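
    # Illustrative sketch (not in the original source): with the example
    # config used in the ``param_slices`` docstring below (three regression
    # params and two 3-class classification params), n_outputs would be
    # 3 + 3 + 3 == 9, matching the end of the last output slice shown there:
    #
    #     >>> learned_params.n_outputs  # hypothetical, reusing that config
    #     9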

    @cached_property
    def param_slices(self):
        """
        Returns a dict mapping parameter names to tuples of two slices: the
        slice to take of the targets tensor for values of that parameter, and
        the slice to take of the outputs tensor for that parameter.

        This is used during training: the ``targets`` tensor contains (for a
        batch of one or more scenarios) the known target values for each
        parameter being learned, and the ``outputs`` tensor contains the
        predicted values for each parameter returned from the model being
        trained (where in ``outputs`` there is a predicted likelihood for
        each category of a classification parameter).

        Examples
        --------

        >>> from dnadna.params import ParamSet
        >>> import pandas as pd
        >>> config = {
        ...     'b': {'type': 'regression', 'loss_func': 'MSE'},
        ...     # 'x' gets the default loss func, cross entropy
        ...     'x': {'type': 'classification', 'classes': 3},
        ...     # 'a' gets the default loss func, MSE
        ...     'a': {'type': 'regression'},
        ...     'z': {'type': 'classification', 'classes': 3,
        ...           'loss_func': 'Cross Entropy'},
        ...     'y': {'type': 'regression', 'loss_func': 'MCE'}
        ... }
        >>> learned_params = ParamSet(config)
        >>> learned_params.param_slices
        {'b': (slice(0, 1, None), slice(0, 1, None)),
         'x': (slice(1, 2, None), slice(1, 4, None)),
         'a': (slice(2, 3, None), slice(4, 5, None)),
         'z': (slice(3, 4, None), slice(5, 8, None)),
         'y': (slice(4, 5, None), slice(8, 9, None))}
        """

        targets_idx = 0
        outputs_idx = 0
        slices = {}

        for param_name in self.param_names:
            param = self.params[param_name]
            if param['type'] == 'regression':
                slices[param_name] = (
                    slice(targets_idx, targets_idx + 1),
                    slice(outputs_idx, outputs_idx + 1)
                )
                targets_idx += 1
                outputs_idx += 1
            else:
                # classification
                n_classes = param['n_classes']
                slices[param_name] = (
                    slice(targets_idx, targets_idx + 1),
                    slice(outputs_idx, outputs_idx + n_classes)
                )
                targets_idx += 1
                outputs_idx += n_classes

        return slices
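
    # Illustrative sketch (an assumption about typical usage, not from the
    # original source): during training, the two slices select a parameter's
    # columns from batched ``targets`` and ``outputs`` tensors, e.g.:
    #
    #     >>> target_slice, output_slice = learned_params.param_slices['x']
    #     >>> x_targets = targets[:, target_slice]   # shape (batch, 1)
    #     >>> x_outputs = outputs[:, output_slice]   # shape (batch, 3)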

    def _normalize_params(self):
        """
        After being initialized with some parameter configurations, this
        performs additional post-initialization normalization of the
        parameter configurations.

        Currently this normalizes the classes of classification params--the
        configuration may contain either an integer (giving the number of
        classes) or a list of strings (or ints) giving labels to the classes.

        For each classification parameter we add an ``'n_classes'`` property,
        which may be equivalent to ``'classes'`` (in the integer case).  In
        the case where ``'classes'`` is a list of labels, we also normalize
        so that all the labels are unique strings.

        It also sets the default value for the ``'loss_func'`` property on
        each parameter that lacks it.  This should be performed automatically
        via the schema, but that is not working yet.
        """

        for param_name, param in self.classification_params.items():
            orig_classes = param['classes']
            if isinstance(orig_classes, list):
                # We want to preserve order so a set isn't used
                classes = unique(map(str, orig_classes))
                if len(classes) != len(orig_classes):
                    warnings.warn(
                        f'classification parameter {param_name} has '
                        f'duplicates in its list of class labels; the class '
                        f'labels have been normalized to be unique: '
                        f'{classes}', DNADNAWarning)

                param['n_classes'] = len(classes)
                param['classes'] = classes
            else:
                param['n_classes'] = param['classes']
                # The class names are just numbered
                param['classes'] = [str(cls)
                                    for cls in range(param['classes'])]

        for param in self.params.values():
            param.setdefault('loss_func',
                             self.default_loss_funcs[param['type']])
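
    # Illustrative sketch (not in the original source) of the normalization
    # performed above:
    #
    #     {'type': 'classification', 'classes': 3}
    #       -> {'type': 'classification', 'classes': ['0', '1', '2'],
    #           'n_classes': 3, 'loss_func': 'Cross Entropy'}
    #
    #     {'type': 'classification', 'classes': ['a', 'a', 'b']}
    #       -> DNADNAWarning about duplicate labels, then normalized to
    #          {'classes': ['a', 'b'], 'n_classes': 2, ...}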


class LearnedParams(ParamSet):
    """
    Class for managing the parameters on which a model is trained.

    Attributes
    ----------
    config : `~dnadna.utils.config.Config` or `dict`
        The learned parameters configuration, conforming to the
        :ref:`learned parameters schema <schema-param-set>`.
    """

    def __init__(self, config, scenario_params, validate=True):
        # If the config was originally given as a list (see
        # ParamSet.__init__), we keep its existing order.  Otherwise,
        # parameters are re-ordered according to their order in the
        # scenario_params table.
        if not isinstance(config, list):
            new_config = {}
            for param_name in scenario_params.columns:
                if param_name in config:
                    new_config[param_name] = config[param_name]

            config = new_config

        super().__init__(config, validate=validate)
        self._scenario_params = scenario_params

    @property
    def scenario_params(self):
        """
        The :ref:`scenario parameters table <dnadna-dataset-scenario-params>`,
        as a `pandas.DataFrame` giving the known values of the parameters
        being trained on, for all scenarios in the dataset.
        """
        return self._scenario_params
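
    # Illustrative sketch (an assumption, not from the original source; the
    # parameter and column names are hypothetical and the table is
    # simplified): scenario_params is a pandas DataFrame with one column per
    # parameter, and its column order determines the parameter order when the
    # config is given as a plain dict:
    #
    #     >>> import pandas as pd
    #     >>> scenario_params = pd.DataFrame(
    #     ...     {'selection': [0, 1, 1],
    #     ...      'mutation_rate': [1e-8, 2e-8, 1e-8]})
    #     >>> config = {
    #     ...     'mutation_rate': {'type': 'regression'},
    #     ...     'selection': {'type': 'classification', 'classes': 2}
    #     ... }
    #     >>> learned_params = LearnedParams(config, scenario_params)
    #     >>> learned_params.param_names  # reordered to match the table
    #     ['selection', 'mutation_rate']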

    @cached_property
    def loss_funcs(self):
        """Maps parameter names to their loss functions."""

        loss_funcs = {}

        for param_name, param in self.params.items():
            loss_func = param['loss_func']
            loss_func_cls = self._normalize_loss_func(loss_func)
            if loss_func_cls is nn.CrossEntropyLoss:
                # NOTE: Here we make a special case, but there are several
                # other loss functions that take some class weights as an
                # argument, so perhaps we could generalize this a bit if
                # desired.  This is also very specific to classification
                # parameters, which might help later for generalization
                w = self.scenario_params.groupby(param_name).size()
                # Inverse of the frequency: smaller classes should weigh more
                # in the loss.
                weight = torch.Tensor(w.sum() / w)
                loss = loss_func_cls(weight=weight)
            else:
                loss = loss_func_cls()

            loss_funcs[param_name] = loss

        return loss_funcs
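
    # Illustrative sketch (not in the original source) of the class-weight
    # computation above: if a classification parameter has class counts of
    # 10 and 30 scenarios in scenario_params, then w.sum() / w gives weights
    # of [4.0, 1.333...], so the rarer class contributes more to the loss.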

    @cached_property
    def loss_weights(self):
        """Maps parameter names to their loss weights (if any)."""

        loss_weights = {}

        for param_name, param in self.params.items():
            # loss_weights should be optional
            if param['type'] == "classification":
                weight = torch.Tensor([1 / param['n_classes']])
            else:
                bad_val = self.scenario_params[param_name].isnull().sum()
                freq_ok = 1 - bad_val / len(self.scenario_params)
                weight = torch.Tensor([1 / freq_ok])

            weight *= param['loss_weight']
            loss_weights[param_name] = weight

        return loss_weights
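
    # Illustrative sketch (not in the original source) of the weights
    # computed above: a 4-class classification parameter gets a base weight
    # of 1 / 4 = 0.25, while a regression parameter whose scenario_params
    # column has 25% missing values gets 1 / 0.75 ~= 1.333; both are then
    # scaled by the parameter's configured 'loss_weight' (1 by default).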

    def to(self, device=None):
        r"""
        Similarly to `torch.nn.Module.to` with ``device=device``, moves all
        `loss_weights` and the parameters of all `loss_funcs` to the
        specified device.
        """

        if device is None:
            device = 'cuda' if torch.cuda.is_available() else 'cpu'

        for param, lw in self.loss_weights.items():
            self.loss_weights[param] = lw.to(device)

        for lf in self.loss_funcs.values():
            lf.to(device)

        return self
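
    # Illustrative usage sketch (not in the original source): a training loop
    # might call ``learned_params.to(device)`` once, so that the loss weights
    # and any class weights held by CrossEntropyLoss live on the same device
    # as the model's outputs.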

    @staticmethod
    def _normalize_loss_func(loss_func):
        """
        Maps a loss function name (as given in the configuration) to the
        actual function or class which implements that function.

        The loss function name is matched case-insensitively, and may contain
        spaces (which are removed).

        Currently, the available loss functions are any defined in the
        `torch.nn` module with a name ending in ``"Loss"``, or additional
        loss functions defined in this module.  In the future this will be
        made more extensible as needed.

        Examples
        --------

        >>> from dnadna.params import LearnedParams
        >>> LearnedParams._normalize_loss_func('MSE')
        <class 'torch.nn.modules.loss.MSELoss'>
        >>> LearnedParams._normalize_loss_func('cross entropy')
        <class 'torch.nn.modules.loss.CrossEntropyLoss'>
        >>> LearnedParams._normalize_loss_func('MCE')
        <class 'dnadna.params.MCELoss'>
        """

        try:
            return LOSS_FUNCS[loss_func.lower().replace(' ', '')]
        except KeyError:
            raise ValueError(
                f'unknown loss function: {loss_func}; must be '
                f'one of {", ".join(sorted(LOSS_FUNCS))}')